Here, we will apply a k-nearest neighbor (KNN) algorithm to classify the scATAC cells to a given cell type category with the help of our training set, the Multiome experiment. Remember, that KNN works on a basic assumption that data points of similar categories are closer to each other.
library(Seurat)
library(Signac)
library(flexclust)
library(tidyverse)
library(plyr)
library(harmony)
library(class)
library(ggplot2)
library(reshape2)
cell_type = "NBC_MBC"
# Paths
path_to_obj <- str_c(
here::here("scATAC-seq/results/R_objects/level_4/"),
cell_type,
"/",
cell_type,
"_integrated_level_4.rds",
sep = ""
)
path_to_obj_RNA <- str_c(
here::here("scRNA-seq/3-clustering/5-level_5/"),
cell_type,
"/",
cell_type,
"_seu_obj_level_5_eta_DZnoproli.rds",
sep = ""
)
path_to_level_4 <- here::here("scATAC-seq/results/R_objects/level_4/NBC_MBC/")
path_to_save <- str_c(path_to_level_4, "NBC_MBC_annotated_level_4.rds")
reduction <- "harmony"
dims <- 1:40
color_palette<- c("#72663f",
"#bca146",
"#75a5b0",
"#eae2c6",
"#588e1a",
"#71ccb7",
"#352f1d",
"#dcf0f4",
"#8C76B0",
"#95d1de",
"#52828D",
"#dbcc9b",
"#bca146",
"#69c291",
"#205b67",
"green")
cols_nbc <- c(
"NBC" = "#dcf0f4",
"NBC early activation" = "#95d1de",
"NBC IFN-activated" = "#52828D",
"NBC CD229+" = "#8C76B0",
"DZ-early S phase" = "#006FC4",
"Early GC-commited NBC" = "#75a5b0",
"GC-commited NBC" = "#71ccb7",
"preGC" = "#69c291",
"Proliferative NBC" = "#205b67",
"GC DZ Noproli" = "#588e1a",
"Early MBC" = "#eae2c6",
"ncsMBC" = "#dbcc9b",
"ncsMBC FCRL4+/FCRL5+" = "#bca146",
"csMBC" = "#72663f",
"csMBC FCRL4+/FCRL5+" = "#a3925a",
"MBC FCRL5+" = "#352f1d"
)
We need to load the scRNAseq annotation from Multiome experiment (cell barcode and cell-type assigned) and the integrated scATAC data. Note that there are 221 cells difference between scATAC and scRNA from multiome.
seurat <- readRDS(path_to_obj_RNA)
tonsil_RNA_annotation <- seurat@meta.data %>%
rownames_to_column(var = "cell_barcode") %>%
dplyr::filter(assay == "multiome") %>%
dplyr::select("cell_barcode", "names_level_5_clusters_eta")
head(tonsil_RNA_annotation)
## cell_barcode names_level_5_clusters_eta
## 1 co7dzuup_xuczw9vc_AAACATGCACACCAAC-1 GC-commited NBC
## 2 co7dzuup_xuczw9vc_AAACCAACAAGCTTAT-1 csMBC
## 3 co7dzuup_xuczw9vc_AAACCAACAGCTTACA-1 NBC
## 4 co7dzuup_xuczw9vc_AAACCGAAGTGAGGTA-1 NBC
## 5 co7dzuup_xuczw9vc_AAACCGCGTTGTAACG-1 csMBC
## 6 co7dzuup_xuczw9vc_AAACCGGCAAATACCT-1 ncsMBC FCRL4+/FCRL5+
seurat$names_level_5_clusters_eta <- factor(seurat$names_level_5_clusters_eta, levels = names(cols_nbc))
DimPlot(seurat,
group.by = "names_level_5_clusters_eta",
cols = cols_nbc,
pt.size = 0.1)
seurat_ATAC <- readRDS(path_to_obj)
seurat_ATAC
## An object of class Seurat
## 270784 features across 38569 samples within 1 assay
## Active assay: peaks_macs (270784 features, 258100 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
p1 <- DimPlot(seurat_ATAC,
pt.size = 0.1)
p1
Annotation level 4 for scATAC will be defined “a priori” as unannotated and the scRNA annotation will be transfered to the scATAC-multiome cells based on the same cell barcode.
tonsil_scATAC_df <- data.frame(cell_barcode = colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"])
tonsil_scATAC_df$names_level_5_clusters_eta <- "unannotated"
df_all <- rbind(tonsil_RNA_annotation,tonsil_scATAC_df)
rownames(df_all) <- df_all$cell_barcode
df_all <- df_all[colnames(seurat_ATAC), ]
seurat_ATAC$names_level_5_clusters_eta <- df_all$names_level_5_clusters_eta
seurat_ATAC@meta.data$annotation_prob <- 1
seurat_ATAC$names_level_5_clusters_eta[is.na(seurat_ATAC$names_level_5_clusters_eta)] <- "DZ-early S phase"
melt(table(seurat_ATAC$names_level_5_clusters_eta))
## Var1 value
## 1 csMBC 2408
## 2 csMBC FCRL4+/FCRL5+ 694
## 3 Early GC-commited NBC 273
## 4 Early MBC 95
## 5 GC DZ Noproli 2
## 6 GC-commited NBC 892
## 7 MBC FCRL5+ 529
## 8 NBC 4588
## 9 NBC CD229+ 310
## 10 NBC early activation 1736
## 11 NBC IFN-activated 105
## 12 ncsMBC 3388
## 13 ncsMBC FCRL4+/FCRL5+ 419
## 14 preGC 46
## 15 Proliferative NBC 10
## 16 unannotated 23074
table(is.na(seurat_ATAC$names_level_5_clusters_eta))
##
## FALSE
## 38569
seurat_ATAC$names_level_5_clusters_eta <- factor(seurat_ATAC$names_level_5_clusters_eta, levels = names(cols_nbc))
DimPlot(seurat_ATAC,
group.by = "names_level_5_clusters_eta",
split.by = "assay",
cols = cols_nbc,
pt.size = 0.5)
Data splicing basically involves splitting the data set into training and testing data set.
reference_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "multiome"]
query_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"]
reduction_ref <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[reference_cells, dims]
reduction_query <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[query_cells, dims]
We’re going to calculate the number of observations in the training dataset that correspond to the Multiome data. The reason we’re doing this is that we want to initialize the value of ‘K’ in the KNN model. To do that, we split our training data in two part: a train.loan, that correspond to the random selection of the 70% of the training data and the test.loan, that is the remaining 30% of the data set. The first one is used to traint the system while the second is uses to evaluate the learned system.
dat.d <- sample(1:nrow(reduction_ref),
size=nrow(reduction_ref)*0.7,replace = FALSE)
train.loan <- reduction_ref[dat.d,] # 70% training data
test.loan <- reduction_ref[-dat.d,] # remaining 30% test data
train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$names_level_5_clusters_eta
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$names_level_5_clusters_eta
k.optm <- c()
k.values <- c()
for (i in c(2,4,6,8,10,12,13,14,15,16,20,32,64,100)){
print(i)
knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=i)
k.optm <- c(k.optm, 100 * sum(test.loan_labels == knn.mod)/NROW(test.loan_labels))
k.values <- c(k.values,i)
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 20
## [1] 32
## [1] 64
## [1] 100
Now we can plot the accuracy of the model taking in account a range of different K and selec the best one.
k.optim = data.frame(k.values,k.optm)
p3 <- ggplot(data=k.optim, aes(x=k.values, y=k.optm, group=1)) +
geom_line() +
geom_point() +
geom_vline(xintercept=12, linetype="dashed", color = "red")
p3
train.loan <- reduction_ref
test.loan <- reduction_query
train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$names_level_5_clusters_eta
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$names_level_5_clusters_eta
knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=15, prob=T)
annotation_data <- data.frame(query_cells, knn.mod, attr(knn.mod,"prob"))
colnames(annotation_data) <- c("query_cells",
"names_level_5_clusters_eta",
"annotation_prob")
annotation_data$names_level_5_clusters_eta <- as.character(annotation_data$names_level_5_clusters_eta)
seurat_ATAC@meta.data[annotation_data$query_cells,]$names_level_5_clusters_eta <- annotation_data$names_level_5_clusters_eta
seurat_ATAC@meta.data[annotation_data$query_cells,]$annotation_prob <- annotation_data$annotation_prob
seurat_ATAC$names_level_5_clusters_eta <- factor(seurat_ATAC$names_level_5_clusters_eta)
DimPlot(
seurat_ATAC,
cols = cols_nbc,
label = T,
group.by = "names_level_5_clusters_eta",
pt.size = 0.1)
DimPlot(
cols = cols_nbc,
seurat_ATAC, reduction = "umap",
group.by = "names_level_5_clusters_eta",
pt.size = 0.1, split.by = "assay")
melt(table(seurat_ATAC$names_level_5_clusters_eta))
## Var1 value
## 1 NBC 17793
## 2 NBC early activation 2700
## 3 NBC IFN-activated 105
## 4 NBC CD229+ 344
## 5 Early GC-commited NBC 377
## 6 GC-commited NBC 1554
## 7 preGC 58
## 8 Proliferative NBC 10
## 9 GC DZ Noproli 2
## 10 Early MBC 129
## 11 ncsMBC 6584
## 12 ncsMBC FCRL4+/FCRL5+ 892
## 13 csMBC 5637
## 14 csMBC FCRL4+/FCRL5+ 1462
## 15 MBC FCRL5+ 922
#saveRDS(seurat_ATAC, path_to_save)
Note that the probability of the prediction was lower in the transitioning cells and in not-defined clusters.
seurat_ATAC_scATAC = subset(seurat_ATAC, assay == "scATAC")
FeaturePlot(
seurat_ATAC_scATAC, reduction = "umap",
features = "annotation_prob",
pt.size = 0.1)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 class_7.3-17 harmony_1.0 Rcpp_1.0.6 plyr_1.8.6 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.5 tidyverse_1.3.0 flexclust_1.4-0 modeltools_0.2-23 lattice_0.20-41 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.10 fastmatch_1.1-0 igraph_1.2.6 lazyeval_0.2.2 splines_4.0.3 BiocParallel_1.22.0 listenv_0.8.0 scattermore_0.7 SnowballC_0.7.0 GenomeInfoDb_1.24.2 digest_0.6.27 htmltools_0.5.1.1 fansi_0.5.0 magrittr_2.0.1 tensor_1.5 cluster_2.1.0 ROCR_1.0-11 globals_0.14.0 Biostrings_2.56.0 modelr_0.1.8 matrixStats_0.59.0 docopt_0.7.1 spatstat.sparse_2.0-0 colorspace_2.0-2 rvest_0.3.6 blob_1.2.1 ggrepel_0.9.1 haven_2.3.1 xfun_0.18 sparsesvd_0.2 crayon_1.4.1 RCurl_1.98-1.2 jsonlite_1.7.2 spatstat.data_2.1-0 survival_3.2-7 zoo_1.8-9 glue_1.4.2 polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.34.0 XVector_0.28.0 leiden_0.3.8 future.apply_1.7.0 BiocGenerics_0.34.0 abind_1.4-5 scales_1.1.1 DBI_1.1.0 miniUI_0.1.1.1 viridisLite_0.4.0 xtable_1.8-4
## [52] reticulate_1.20 spatstat.core_2.2-0 htmlwidgets_1.5.3 httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.2 ica_1.0-2 pkgconfig_2.0.3 farver_2.1.0 dbplyr_1.4.4 ggseqlogo_0.1 uwot_0.1.10 deldir_0.2-10 here_1.0.1 utf8_1.2.1 labeling_0.4.2 tidyselect_1.1.1 rlang_0.4.11 later_1.2.0 cellranger_1.1.0 munsell_0.5.0 tools_4.0.3 cli_3.0.0 generics_0.1.0 broom_0.7.2 ggridges_0.5.3 evaluate_0.14 fastmap_1.1.0 yaml_2.2.1 goftest_1.2-2 fs_1.5.0 knitr_1.30 fitdistrplus_1.1-5 RANN_2.6.1 pbapply_1.4-3 future_1.21.0 nlme_3.1-150 mime_0.11 slam_0.1-47 RcppRoll_0.3.0 xml2_1.3.2 rstudioapi_0.11 compiler_4.0.3 plotly_4.9.4.1 png_0.1-7 spatstat.utils_2.2-0 reprex_0.3.0 tweenr_1.0.1 stringi_1.6.2 Matrix_1.3-4 vctrs_0.3.8
## [103] pillar_1.6.1 lifecycle_1.0.0 BiocManager_1.30.10 spatstat.geom_2.2-0 lmtest_0.9-38 RcppAnnoy_0.0.18 data.table_1.14.0 cowplot_1.1.1 bitops_1.0-7 irlba_2.3.3 httpuv_1.6.1 patchwork_1.1.1 GenomicRanges_1.40.0 R6_2.5.0 bookdown_0.21 promises_1.2.0.1 KernSmooth_2.23-17 gridExtra_2.3 lsa_0.73.2 IRanges_2.22.1 parallelly_1.26.1 codetools_0.2-17 MASS_7.3-53 assertthat_0.2.1 rprojroot_2.0.2 withr_2.4.2 qlcMatrix_0.9.7 sctransform_0.3.2 Rsamtools_2.4.0 S4Vectors_0.26.0 GenomeInfoDbData_1.2.3 hms_0.5.3 mgcv_1.8-33 parallel_4.0.3 rpart_4.1-15 rmarkdown_2.5 Rtsne_0.15 ggforce_0.3.2 lubridate_1.7.9 shiny_1.6.0